library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(broom)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
library(metR)
##
## Attaching package: 'metR'
##
## The following object is masked from 'package:purrr':
##
## cross
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
## OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
london_weekday = read.csv('london_weekdays.csv')
london_weekend = read.csv('london_weekends.csv')
london_weekday <- london_weekday[order(london_weekday$room_type), ]
london_weekday$room_type <- as.numeric(as.factor(london_weekday$room_type))
london_weekday$room_shared = ifelse(london_weekday$room_shared == "False", 0, 1)
london_weekday$room_private = ifelse(london_weekday$room_private == "False", 0, 1)
london_weekday$host_is_superhost = ifelse(london_weekday$host_is_superhost == "False", 0, 1)
london_weekend <- london_weekend[order(london_weekend$room_type), ]
london_weekend$room_type <- as.numeric(as.factor(london_weekend$room_type))
london_weekend$room_shared = ifelse(london_weekend$room_shared == "False", 0, 1)
london_weekend$room_private = ifelse(london_weekend$room_private == "False", 0, 1)
london_weekend$host_is_superhost = ifelse(london_weekend$host_is_superhost == "False", 0, 1)
#cwd <- combined %>%
# filter(day_type == "Weekday")
#cwd <- cwd %>%
# select(-X, -room_type, -room_shared, -room_private, -host_is_superhost, -day_type)
corr_cwd <- cor(london_weekday)
corr_cwd <- reshape2::melt(corr_cwd)
ggplot(corr_cwd %>% filter(Var2 == "realSum" & Var1 != "realSum"), aes(Var1, Var2, fill = value, label = round(value, 2))) +
geom_tile(color = "white") +
geom_text(color = "white", size = 4) +
scale_fill_gradient2(low = "#440154", mid = "#21918C", high = "#FDE725", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Correlation") +
ggtitle("Correlation Plot for Price of Listings") +
xlab("Variables") + ylab("Price of Listing") +
labs(subtitle = "Weekday Data") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) +
coord_fixed()
#cwe <- combined %>%
# filter(day_type == "Weekend") %>%
# select(-X, -room_type, -room_shared, -room_private, -host_is_superhost, -day_type)
corr_cwe <- cor(london_weekend)
corr_cwe <- reshape2::melt(corr_cwe)
ggplot(corr_cwe %>% filter(Var2 == "realSum" & Var1 != "realSum"), aes(Var1, Var2, fill = value, label = round(value, 2))) +
geom_tile(color = "white") +
geom_text(color = "white", size = 4) +
scale_fill_gradient2(low = "#440154", mid = "#21918C", high = "#FDE725", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Correlation") +
ggtitle("Correlation Plot for Price of Listings") +
xlab("Variables") + ylab("Price of Listing") +
labs(subtitle = "Weekend Data") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) +
coord_fixed()
london_weekday = read.csv('london_weekdays.csv')
london_weekend = read.csv('london_weekends.csv')
london_weekday$day_type = 'Weekday'
london_weekend$day_type = 'Weekend'
combined <- rbind(london_weekday, london_weekend)
ggplot(data = combined) +
geom_boxplot(aes(y = realSum)) +
ggtitle("Box Plot of Listing Price") +
xlab("") + ylab("Price per Night (EUR)") +
labs(subtitle = "Price Variable Outliers") +
theme_classic()
ggplot(data = combined) +
geom_boxplot(aes(y = dist)) +
ggtitle("Box Plot of Listing's Distance from City Center") +
xlab("") + ylab("Distance (km)") +
labs(subtitle = "Distance Variable Outliers") +
theme_classic()
ggplot(data = combined) +
geom_bar(aes(x = room_type, y = after_stat(count), fill = room_type)) +
ggtitle("Bar Plot of Room Type") +
xlab("Room Type") + ylab("Count") +
labs(subtitle = "Type of Listing") +
scale_fill_viridis_d() +
theme_classic()
ggplot(data = combined) +
geom_bar(aes(x = person_capacity, y = after_stat(count), fill = as.factor(person_capacity))) +
ggtitle("Bar Plot of Person Capacity") +
xlab("Person Capacity") + ylab("Count") +
labs(subtitle = "Room Capacity") +
scale_fill_viridis_d(name = "Person Capacity") +
theme_classic()
london_cleaned <- combined %>%
filter(room_type != "Shared room") %>%
filter(dist < 13) %>%
filter(realSum < 1000) %>%
select(day_type, lng, lat, room_type, person_capacity, dist, realSum) %>%
mutate(person_capacity = ifelse(person_capacity == 6, 5, as.integer(person_capacity)))
ggplot() +
geom_density(data = london_cleaned, aes(x = realSum, after_stat(count), fill = day_type, group = day_type), alpha = 0.4) +
scale_fill_viridis_d(name = "Part of the Week", direction = -1) +
ggtitle("Density Plot for Price per Night") +
xlab("Price per Night (Pounds)") + ylab("Count") +
labs(subtitle = "Weekday and Weekend Data") +
theme_classic()
ggplot(data = london_cleaned, aes(x = dist, y = realSum, color = as.factor(person_capacity))) +
geom_point(size = 2, shape = 16, alpha = 0.35) +
geom_smooth(method = gam, color = "black", alpha = 0.5) +
facet_grid(day_type~room_type, labeller = label_both) +
scale_color_viridis_d(name = "Room Capacity") +
ggtitle("Scatter Plot of Combined Data") +
xlab("Distance (km)") + ylab("Price (EUR)") +
labs(subtitle = "Distance vs Price, Colored by Room Capacity, Faceted by Room Type and Day Type") +
theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
london_grouped <- london_cleaned %>%
select(room_type, person_capacity, dist, realSum, day_type) %>%
group_by(room_type, person_capacity, day_type) %>%
summarize(avg_distance = mean(dist),
avg_price = mean(realSum),
.groups = 'keep'
)
ggplot(data = london_grouped) +
geom_point(aes(x = person_capacity, y = avg_price, group = room_type, color = room_type)) +
geom_line(aes(x = person_capacity, y = avg_price, group = room_type, color = room_type)) +
facet_grid(~day_type, labeller = label_both) +
scale_color_viridis_d(name = "Room Type") +
ggtitle("Line Plot of Average Price") +
xlab("Room Capacity") + ylab("Average Price") +
labs(subtitle = "Average Values for Price across Room Capacities\nColored by Room Type\nFaceted by Time of the Week") +
theme_classic()
ggplot(data = london_grouped) +
geom_point(aes(x = person_capacity, y = avg_distance, group = room_type, color = room_type)) +
geom_line(aes(x = person_capacity, y = avg_distance, group = room_type, color = room_type)) +
facet_grid(~day_type, labeller = label_both) +
scale_color_viridis_d(name = "Room Type") +
ggtitle("Line Plot of Average Distance") +
xlab("Room Capacity") + ylab("Average Distance") +
labs(subtitle = "Average Values for Distance across Room Capacities\nColored by Room Type\nFaceted by Time of the Week") +
theme_classic()
lwd <- london_cleaned %>%
filter(day_type == "Weekday")
lwe <- london_cleaned %>%
filter(day_type == "Weekend")
wdl <- lwd %>% select(lng, lat)
wdl <- wdl[!duplicated(wdl), ]
wel <- lwe %>% select(lng, lat)
wel <- wel[!duplicated(wel), ]
common_locations <- wdl %>%
inner_join(wel, by = join_by(lng == lng, lat == lat))
london_weekday_common <- lwd %>%
inner_join(common_locations, by = join_by(lng == lng, lat == lat)) %>%
dplyr::rename("weekday_price" = "realSum")
london_weekday_common <- london_weekday_common[!duplicated(london_weekday_common[, c("lng", "lat")]), ]
london_weekend_common <- lwe %>%
inner_join(common_locations, by = join_by(lng == lng, lat == lat)) %>%
dplyr::rename("weekend_price" = "realSum")
london_weekend_common <- london_weekend_common[!duplicated(london_weekend_common[, c("lng", "lat")]), ]
london_common <- london_weekday_common %>%
inner_join(london_weekend_common %>% select(lng, lat, weekend_price), by = join_by(lng == lng, lat == lat)) %>%
select(-day_type)
london_common$price_higher[(london_common$weekday_price - london_common$weekend_price) > 0] <- "Weekdays"
london_common$price_higher[(london_common$weekday_price - london_common$weekend_price) == 0] <- "Same"
london_common$price_higher[(london_common$weekday_price - london_common$weekend_price) < 0] <- "Weekends"
london_common$ratio = london_common$weekday_price / london_common$weekend_price
head(london_common, 5)
## lng lat room_type person_capacity dist weekday_price
## 1 -0.16032 51.46531 Entire home/apt 2 5.301018 570.0981
## 2 -0.09683 51.50343 Private room 2 2.198946 297.9844
## 3 -0.10554 51.52407 Private room 2 2.322958 336.7906
## 4 -0.16575 51.46292 Private room 2 5.707825 226.7222
## 5 -0.12055 51.53728 Private room 3 3.257945 256.3560
## weekend_price price_higher ratio
## 1 567.0406 Weekdays 1.0053920
## 2 296.5733 Weekdays 1.0047581
## 3 335.1443 Weekdays 1.0049123
## 4 225.5462 Weekdays 1.0052138
## 5 281.0508 Weekends 0.9121339
ggplot(data = london_common, aes(x = (weekday_price - weekend_price), y = after_stat(count))) +
geom_density(fill = "#3B528B", alpha = 0.4) +
ggtitle("Density Plot for Price Difference on Weekday and Weekend Data") +
xlab("Count") + ylab("Price Difference") +
labs(subtitle = "Negative -> Price Higher on Weekends \nPositive -> Price Higher on Weekdays") +
theme_classic()
api_key <- "AIzaSyCDXjJr2S8veUhq9yMttKfHTQTYtfSoJRA"
register_google(key = api_key)
london_map <- get_map(location = c(lon = -0.11, lat = 51.5), zoom = 11, color = "bw")
## ℹ <https://maps.googleapis.com/maps/api/staticmap?center=51.5,-0.11&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx>
ggmap(london_map)+
geom_point(data = london_common, aes(x = lng, y = lat, color = price_higher, size = abs(weekday_price - weekend_price)), shape = 16, alpha = 0.5) +
scale_color_viridis_d() +
xlab("") +
ylab("") +
theme_classic()
ggplot(data = london_common, aes(x = dist, y = log(ratio), group = room_type, color = as.factor(person_capacity))) +
geom_line() +
facet_grid(room_type~person_capacity, labeller = label_both) +
scale_color_viridis_d(direction = -1) +
ggtitle("Line Plot for Price Ratio between Weekdays and Weekends") +
xlab("Distance (km)") + ylab("Price Ratio") +
labs(subtitle = "Ratio Value: \nNegative -> Price Higher on Weekends \nPositive -> Price Higher on Weekdays") +
theme_minimal()
london_common_grouped <- london_common %>%
select(room_type, person_capacity, dist, weekday_price, weekend_price) %>%
group_by(room_type, person_capacity) %>%
summarize(avg_dist = mean(dist),
avg_weekday_price = mean(weekday_price),
avg_weekend_price = mean(weekend_price),
.groups = 'keep'
)
london_common_grouped
## # A tibble: 8 × 5
## # Groups: room_type, person_capacity [8]
## room_type person_capacity avg_dist avg_weekday_price avg_weekend_price
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Entire home/apt 2 4.85 386. 392.
## 2 Entire home/apt 3 4.73 417. 422.
## 3 Entire home/apt 4 4.80 504. 516.
## 4 Entire home/apt 5 4.90 585. 599.
## 5 Private room 2 5.70 196. 203.
## 6 Private room 3 4.86 231. 237.
## 7 Private room 4 5.69 241. 256.
## 8 Private room 5 5.20 335. 363.
ggplot(data = london_common_grouped, aes(x = person_capacity, y = round((avg_weekday_price / avg_weekend_price), 2), color = room_type)) +
geom_point() +
geom_line() +
facet_wrap(~room_type) +
scale_color_viridis_d(name = "Room Type") +
ggtitle("Line Plot for Average Price Ratio between Weekdays and Weekends") +
xlab("Person Capacity") + ylab("Average Price Ratio") +
labs(subtitle = "Actual Data \nRatio Value: \nValue below 1 -> Price Higher on Weekends \nValue above 1 -> Price Higher on Weekdays") +
theme_classic()
grid = expand.grid(person_capacity = 2:5, room_type = c("Entire home/apt", "Private room"), dist = seq(1, 13, 0.25))
gam_weekday <- gam(data = lwd, formula = realSum ~ s(dist) + room_type * person_capacity, method = "REML")
summary(gam_weekday)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## realSum ~ s(dist) + room_type * person_capacity
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 215.638 8.740 24.673 < 2e-16 ***
## room_typePrivate room -52.573 12.568 -4.183 2.93e-05 ***
## person_capacity 73.537 2.422 30.366 < 2e-16 ***
## room_typePrivate room:person_capacity -53.057 4.580 -11.585 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist) 7.274 8.291 91.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.626 Deviance explained = 62.7%
## -REML = 27091 Scale est. = 14292 n = 4368
gam_weekend <- gam(data = lwe, formula = realSum ~ s(dist) + room_type * person_capacity, method = "REML")
summary(gam_weekend)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## realSum ~ s(dist) + room_type * person_capacity
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 224.684 7.952 28.253 < 2e-16 ***
## room_typePrivate room -79.835 11.612 -6.875 6.93e-12 ***
## person_capacity 68.805 2.224 30.939 < 2e-16 ***
## room_typePrivate room:person_capacity -37.733 4.268 -8.842 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist) 6.608 7.753 114.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.604 Deviance explained = 60.5%
## -REML = 31498 Scale est. = 14384 n = 5076
lwd$weekday_pred = augment(gam_weekday)$.fitted
lwe$weekend_pred = augment(gam_weekend)$.fitted
ggplot(data = lwd) +
geom_point(aes(x = dist, y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
geom_point(aes(x = dist, y = weekday_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
facet_grid(room_type~person_capacity, labeller = label_both) +
scale_color_viridis_d(name = "Price") +
ggtitle("Plot to Show GAM Fit on Data") +
xlab("Distance (km)") + ylab("Price of Listing (EUR)") +
labs(subtitle = "Weekday Data") +
theme_minimal()
ggplot(data = lwe) +
geom_point(aes(x = dist, y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
geom_point(aes(x = dist, y = weekend_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
facet_grid(room_type~person_capacity, labeller = label_both) +
scale_color_viridis_d(name = "Price") +
ggtitle("Plot to Show GAM Fit on Data") +
xlab("Distance (km)") + ylab("Price of Listing (EUR)") +
labs(subtitle = "Weekend Data") +
theme_minimal()
predict1 = predict(gam_weekday, newdata = grid)
predict2 = predict(gam_weekend, newdata = grid)
result = data.frame(grid, weekday_price = as.vector(predict1), weekend_price = as.vector(predict2))
ggplot(data = result) +
geom_point(aes(x = dist, y = weekday_price, color = "Weekday"), shape = 1, size = 0.5) +
geom_point(aes(x = dist, y = weekend_price, color = "Weekend"), shape = 1, size = 0.5) +
facet_grid(room_type~person_capacity, labeller = label_both) +
scale_color_viridis_d(name = "Time of the Week") +
ggtitle("Plot to Show GAM Predictions") +
xlab("Distance (km)") + ylab("Predicted Price") +
labs(subtitle = "Comparing Weekday and Weekend Trends") +
theme_minimal()
result_grouped <- result %>%
select(room_type, person_capacity, dist, weekday_price, weekend_price) %>%
group_by(room_type, person_capacity) %>%
summarize(avg_dist = mean(dist),
avg_weekday_price = mean(weekday_price),
avg_weekend_price = mean(weekend_price),
.groups = 'keep'
)
ggplot(data = result_grouped, aes(x = person_capacity, y = round((avg_weekday_price / avg_weekend_price), 2), color = room_type)) +
geom_point() +
geom_line() +
facet_wrap(~room_type) +
scale_color_viridis_d(name = "Room Type") +
ggtitle("Line Plot for Average Price Ratio between Weekdays and Weekends") +
xlab("Person Capacity") + ylab("Average Price Ratio") +
labs(subtitle = "GAM Predicted Data \nRatio Value: \nValue below 1 -> Price Higher on Weekends \nValue above 1 -> Price Higher on Weekdays") +
theme_classic()
lwd <- lwd[order(lwd$room_type), ]
lwd$room_type_num <- as.numeric(as.factor(lwd$room_type))
lwe <- lwe[order(lwe$room_type), ]
lwe$room_type_num <- as.numeric(as.factor(lwe$room_type))
mapping <- lwd %>%
select(room_type, room_type_num) %>%
group_by(room_type, room_type_num)
mapping <- mapping[!duplicated(mapping[, c("room_type", "room_type_num")]), ]
loess_weekday <- loess(data = lwd, formula = realSum ~ dist * room_type_num * person_capacity, degree = 2, span = 0.5)
summary(loess_weekday)
## Call:
## loess(formula = realSum ~ dist * room_type_num * person_capacity,
## data = lwd, span = 0.5, degree = 2)
##
## Number of Observations: 4368
## Equivalent Number of Parameters: 18.8
## Residual Standard Error: 118.1
## Trace of smoother matrix: 22.94 (exact)
##
## Control settings:
## span : 0.5
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE FALSE FALSE
## drop.square: FALSE FALSE FALSE
loess_weekend <- loess(data = lwe, formula = realSum ~ dist * room_type_num * person_capacity, degree = 2, span = 0.5)
summary(loess_weekend)
## Call:
## loess(formula = realSum ~ dist * room_type_num * person_capacity,
## data = lwe, span = 0.5, degree = 2)
##
## Number of Observations: 5076
## Equivalent Number of Parameters: 19.18
## Residual Standard Error: 118
## Trace of smoother matrix: 23.43 (exact)
##
## Control settings:
## span : 0.5
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE FALSE FALSE
## drop.square: FALSE FALSE FALSE
lwd$weekday_pred = augment(loess_weekday)$.fitted
lwe$weekend_pred = augment(loess_weekend)$.fitted
ggplot(data = lwd) +
geom_point(aes(x = dist, y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
geom_point(aes(x = dist, y = weekday_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
facet_grid(room_type~person_capacity, labeller = label_both) +
scale_color_viridis_d(name = "Price") +
ggtitle("Plot to Show Loess Fit on Data") +
xlab("Distance (km)") + ylab("Price of Listing (EUR)") +
labs(subtitle = "Weekday Data") +
theme_minimal()
ggplot(data = lwe) +
geom_point(aes(x = dist, y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
geom_point(aes(x = dist, y = weekend_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
facet_grid(room_type~person_capacity) +
scale_color_viridis_d(name = "Price") +
ggtitle("Plot to Show Loess Fit on Data") +
xlab("Distance (km)") + ylab("Price of Listing (EUR)") +
labs(subtitle = "Weekend Data") +
theme_minimal()
grid = expand.grid(person_capacity = 2:5, room_type_num = c(1, 2), dist = seq(1, 13, 0.25))
predict1 = predict(loess_weekday, newdata = grid)
predict2 = predict(loess_weekend, newdata = grid)
result = data.frame(grid, weekday_price = as.vector(predict1), weekend_price = as.vector(predict2))
result <- result %>%
inner_join(mapping, by = join_by(room_type_num == room_type_num))
result = na.omit(result)
ggplot(data = result) +
geom_point(aes(x = dist, y = weekday_price, color = "Weekday Price"), shape = 1, size = 0.5) +
geom_point(aes(x = dist, y = weekend_price, color = "Weekend Price"), shape = 1, size = 0.5) +
facet_grid(room_type~person_capacity) +
scale_color_viridis_d(name = "Time of the Week") +
ggtitle("Plot to Show Loess Predictions") +
xlab("Distance (km)") + ylab("Predicted Price") +
labs(subtitle = "Comparing Weekday and Weekend Trends") +
theme_minimal()
result_grouped <- result %>%
select(room_type, person_capacity, dist, weekday_price, weekend_price) %>%
group_by(room_type, person_capacity) %>%
summarize(avg_dist = mean(dist),
avg_weekday_price = mean(weekday_price),
avg_weekend_price = mean(weekend_price),
.groups = 'keep'
)
result_grouped
## # A tibble: 8 × 5
## # Groups: room_type, person_capacity [8]
## room_type person_capacity avg_dist avg_weekday_price avg_weekend_price
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Entire home/apt 2 6.88 343. 338.
## 2 Entire home/apt 3 6.88 385. 377.
## 3 Entire home/apt 4 6.88 465. 455.
## 4 Entire home/apt 5 6.88 566. 555.
## 5 Private room 2 6.88 186. 188.
## 6 Private room 3 6.88 197. 202.
## 7 Private room 4 6.88 221. 240.
## 8 Private room 5 6.88 273. 310.
ggplot(data = result_grouped, aes(x = person_capacity, y = round((avg_weekday_price / avg_weekend_price), 2), color = room_type)) +
geom_point() +
geom_line() +
facet_wrap(~room_type) +
scale_color_viridis_d(name = "Room Type") +
ggtitle("Line Plot for Average Price Ratio between Weekdays and Weekends (Loess Predicted Data)") +
xlab("Person Capacity") + ylab("Average Price Ratio") +
labs(subtitle = "Ratio Value: \nValue below 1 -> Price Higher on Weekends \nValue above 1 -> Price Higher on Weekdays") +
theme_classic()
gam_weekday <- gam(data = lwd, formula = realSum ~ s(dist) + person_capacity, method = "REML")
summary(gam_weekday)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## realSum ~ s(dist) + person_capacity
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.206 6.042 4.668 3.13e-06 ***
## person_capacity 105.804 2.071 51.084 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist) 7.954 8.714 87.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.463 Deviance explained = 46.4%
## -REML = 27892 Scale est. = 20543 n = 4368
gam_weekend <- gam(data = lwe, formula = realSum ~ s(dist) + person_capacity, method = "REML")
summary(gam_weekend)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## realSum ~ s(dist) + person_capacity
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.011 5.529 7.417 1.4e-13 ***
## person_capacity 102.337 1.886 54.248 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist) 7.232 8.26 103.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.454 Deviance explained = 45.5%
## -REML = 32322 Scale est. = 19846 n = 5076
lwd$weekday_pred = augment(gam_weekday)$.fitted
lwe$weekend_pred = augment(gam_weekend)$.fitted
ggplot(data = lwd) +
geom_point(aes(x = dist, y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
geom_point(aes(x = dist, y = weekday_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
facet_grid(room_type~person_capacity, labeller = label_both) +
scale_color_viridis_d(name = "Room Type") +
ggtitle("Plot to Show GAM Fit on Data") +
xlab("Distance (km)") + ylab("Price of Listing (EUR)") +
labs(subtitle = "Weekday Data") +
theme_minimal()
ggplot(data = lwe) +
geom_point(aes(x = dist, y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
geom_point(aes(x = dist, y = weekend_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
facet_grid(room_type~person_capacity, labeller = label_both) +
scale_color_viridis_d(name = "Price") +
ggtitle("Plot to Show GAM Fit on Data") +
xlab("Distance (km)") + ylab("Price of Listing (EUR)") +
labs(subtitle = "Weekend Data") +
theme_minimal()
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
weekdays = read.csv("london_weekdays.csv")
weekends <- read.csv("london_weekends.csv")
df <- rbind(weekends, weekdays)
df$day_type <- ifelse(seq_len(nrow(df)) <= 5379, "Weekend", "Weekday")
#statistics for weekday prices
summary(weekdays$realSum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 64.68 167.46 256.36 360.23 435.45 15499.89
#statistics for weekend prices
summary(weekends$realSum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 54.33 174.51 268.12 364.39 438.27 12937.27
#statistics for average distance from city center
summary(df$dist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04055 3.54913 4.92412 5.32642 6.83692 17.32121
ecdf(df$dist)(13)
## [1] 0.9792855
cat("Count of all Rooms:", nrow(df))
## Count of all Rooms: 9993
filtered_df <- df %>%
filter(room_type == "Shared room")
count_shared_rooms <- nrow(filtered_df)
cat(" \nCount of Shared Rooms:", count_shared_rooms)
##
## Count of Shared Rooms: 50
filtered_df <- df %>%
filter(room_type == "Shared room" & person_capacity >3 )
count_shared_rooms <- nrow(filtered_df)
cat("Count of rows where room_type is 'Shared room' and person_capacity is greater than 3:", count_shared_rooms, "\n")
## Count of rows where room_type is 'Shared room' and person_capacity is greater than 3: 9
filtered_df <- df %>%
filter(person_capacity == 0 )
listing_with_0_rooms <- nrow(filtered_df)
cat("Count of rows where person_capacity is equal to 1:", listing_with_0_rooms, "\n")
## Count of rows where person_capacity is equal to 1: 0
filtered_df <- df %>%
filter(person_capacity == 1 )
listing_with_1_rooms <- nrow(filtered_df)
cat("Count of rows person_capacity is equal to 1:", listing_with_1_rooms, "\n")
## Count of rows person_capacity is equal to 1: 0
filtered_df <- df %>%
filter(person_capacity == 2 )
listing_with_2_rooms <- nrow(filtered_df)
cat("Count of rows person_capacity is equal to 2:", listing_with_2_rooms, "\n")
## Count of rows person_capacity is equal to 2: 6207
filtered_df <- df %>%
filter(dist > 13 )
listing_farther_than_13_miles <- nrow(filtered_df)
cat("Count of listing which are more than 13 mile from city center:", listing_farther_than_13_miles, "\n")
## Count of listing which are more than 13 mile from city center: 207
ggplot(df, aes(x = as.factor(person_capacity))) +
geom_bar() +
facet_grid(~ day_type) +
labs(title = "Frequency of Person Capacity",
x = "Person Capacity",
y = "Frequency")
# ggplot(df, aes(x = as.factor(person_capacity))) +
# geom_bar() +
# facet_grid(~ room_type) +
# labs(title = "Frequency of Person Capacity by Room Type",
# x = "Person Capacity",
# y = "Frequency")
ggplot(df, aes(x = as.factor(person_capacity))) +
geom_bar() +
facet_grid(room_type ~ day_type) +
labs(title = "Frequency of Person Capacity by Room Type and Day Type",
x = "Person Capacity",
y = "Frequency")
ggplot(df, aes(x = realSum)) +
geom_density(fill = "blue", alpha = 0.5) +
facet_grid(~day_type) +
labs(title = "Smooth Frequency Plot for real_sum",
x = "real_sum",
y = "Density")
ggplot(df, aes(x = dist)) +
geom_density(fill = "blue", alpha = 0.5) +
facet_grid(~day_type) +
labs(title = "Smooth Frequency Plot for real_sum",
x = "real_sum",
y = "Density")
subset_df <- subset(df, realSum < 1000)
subset_df <- subset(subset_df, dist < 13)
subset_df <- subset(subset_df, room_type != "Shared room")
subset_df <- subset_df %>%
mutate(person_capacity = ifelse(person_capacity == 6, 5, as.integer(person_capacity)))
subset_df <- na.omit(subset_df)
head(subset_df)
## X realSum room_type room_shared room_private person_capacity
## 1 0 121.1223 Private room False True 2
## 2 1 195.9124 Private room False True 2
## 3 2 193.3253 Private room False True 3
## 4 3 180.3899 Private room False True 2
## 5 4 405.7010 Entire home/apt False False 3
## 6 5 354.1946 Entire home/apt False False 2
## host_is_superhost multi biz cleanliness_rating guest_satisfaction_overall
## 1 False 0 0 6 69
## 2 False 1 0 10 96
## 3 False 1 0 10 95
## 4 False 1 0 9 87
## 5 False 0 1 7 65
## 6 False 0 1 9 93
## bedrooms dist metro_dist attr_index attr_index_norm rest_index
## 1 1 5.734117 0.4370940 222.8822 15.49341 470.0885
## 2 1 4.788905 1.4640505 235.3858 16.36259 530.1335
## 3 1 4.596677 0.4503062 268.9138 18.69325 548.9876
## 4 1 2.054769 0.1326705 472.3813 32.83707 1021.2711
## 5 0 4.491277 0.3541075 318.4915 22.13958 692.7754
## 6 0 4.467894 0.3507494 321.8646 22.37406 703.0686
## rest_index_norm lng lat day_type
## 1 8.413765 -0.04975 51.52570 Weekend
## 2 9.488466 -0.08475 51.54210 Weekend
## 3 9.825922 -0.14585 51.54802 Weekend
## 4 18.278973 -0.10611 51.52108 Weekend
## 5 12.399473 -0.18797 51.49399 Weekend
## 6 12.583702 -0.18805 51.49473 Weekend
api_key <- "AIzaSyCDXjJr2S8veUhq9yMttKfHTQTYtfSoJRA"
register_google(key = api_key)
london_map <- get_map(location = "London", zoom = 11)
## ℹ <https://maps.googleapis.com/maps/api/staticmap?center=London&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx>
## ℹ <https://maps.googleapis.com/maps/api/geocode/json?address=London&key=xxx>
ggmap(london_map)+
geom_point(
data = subset_df,
aes(x = lng, y = lat, color = realSum),
size = 1.3,
alpha = 0.5
) +
scale_color_gradient(name = "Price", low = "#fff200", high = "#301934") + xlab("") + ylab("")
ggplot(subset_df, aes(x = dist, y = realSum)) +
geom_point()+ #+ scale_x_log10() #+ scale_y_log10() +
facet_grid(~day_type) +
geom_smooth(method = "lm", se = FALSE) +
geom_smooth(method = "loess", col = "orange", method.args = list(degree = 1, family = "symmetric"), span = 0.5) #+
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#geom_smooth(method = "rlm", se = FALSE, col = "pink", method.args = list(psi = psi.bisquare))
ggplot(subset_df, aes(x = dist, y = log(realSum))) +
geom_point()+ #+ scale_x_log10() #+ scale_y_log10() +
facet_grid(~day_type) +
geom_smooth(method = "lm", se = FALSE) +
geom_smooth(method = "loess", col = "orange", method.args = list(degree = 1, family = "symmetric"), span = 0.5) #+
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#geom_smooth(method = "rlm", se = FALSE, col = "pink", method.args = list(psi = psi.bisquare))
ggplot(subset_df, aes(x = realSum, fill = room_type)) +
geom_histogram(binwidth = 50, position = "identity", alpha = 0.7) +
facet_wrap(~day_type, scales = "free") +
labs(title = "Distribution of Airbnb Prices on Weekends and Weekdays by Room Type",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 2000) +
scale_fill_viridis(discrete = TRUE)
## Warning: Removed 8 rows containing missing values (`geom_bar()`).
ggplot(subset_df, aes(x = realSum, fill = room_type)) +
geom_histogram(binwidth = 50, position = "dodge", alpha = 0.7) +
labs(title = "Distribution of Airbnb Prices on Weekends and Weekdays by Room Type",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 2000) +
scale_fill_viridis(discrete = TRUE) +
facet_wrap(~day_type)
## Warning: Removed 4 rows containing missing values (`geom_bar()`).
From here we can understand: - On weekends for the same price for a private room, there are more bookings. - In general entire home and apartemnts are less in demand on weekends and weekdays than a private room and for the same price the demand of a entire apartment is more on weekends (as is expected)
average_prices <- subset_df %>%
group_by(room_type, day_type) %>%
summarize(avg_price = mean(realSum))
## `summarise()` has grouped output by 'room_type'. You can override using the
## `.groups` argument.
ggplot(average_prices, aes(x = room_type, y = avg_price, color = room_type)) +
geom_point(position = position_dodge(width = 0.5), size = 7) +
geom_errorbar(
aes(ymin = avg_price - sd(avg_price), ymax = avg_price + sd(avg_price)),
position = position_dodge(width = 0.8),
width = 0.2
) +
labs(title = "Average Price by Room Type",
x = "Room Type",
y = "Average Price",
color = "Room Type") +
theme_minimal() +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~day_type)
average_prices <- subset_df %>%
filter(room_type != "Shared room") %>%
group_by(room_type, day_type) %>%
summarize(avg_price = mean(realSum))
## `summarise()` has grouped output by 'room_type'. You can override using the
## `.groups` argument.
ggplot(average_prices, aes(x = room_type, y = avg_price, color = room_type)) +
geom_point(position = position_dodge(width = 0.5), size = 7) +
geom_errorbar(
aes(ymin = avg_price - sd(avg_price), ymax = avg_price + sd(avg_price)),
position = position_dodge(width = 0.8),
width = 0.2
) +
labs(title = "Average Price by Room Type",
x = "Room Type",
y = "Average Price",
color = "Room Type") +
theme_minimal() +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~day_type)
average_prices <- subset_df %>%
group_by(room_type, cleanliness_rating, day_type) %>%
summarize(avg_price = mean(realSum))
## `summarise()` has grouped output by 'room_type', 'cleanliness_rating'. You can
## override using the `.groups` argument.
ggplot(average_prices, aes(x = cleanliness_rating, y = log(avg_price), color = room_type)) +
geom_point(position = position_dodge(width = 0.5), size = 3) +
geom_line(aes(group = room_type), position = position_dodge(width = 0.5), linetype = "dashed") +
scale_color_viridis_d() +
labs(title = "Average Price for Each Room Type by Cleanliness Rating",
x = "Cleanliness Rating",
y = "Average Price",
color = "Room Type") +
theme_minimal() +
facet_wrap(~day_type)
average_prices_capacity <- subset_df %>%
group_by(room_type, person_capacity, day_type) %>%
summarize(avg_price = mean(realSum))
## `summarise()` has grouped output by 'room_type', 'person_capacity'. You can
## override using the `.groups` argument.
ggplot(average_prices_capacity, aes(x = person_capacity, y = log(avg_price), color = room_type)) +
geom_point(position = position_dodge(width = 0.8), size = 6) +
geom_line(aes(group = room_type), position = position_dodge(width = 0.5), linetype = "dashed") +
scale_color_viridis_d() +
labs(title = "Log Average Price for Each Room Type by Person Capacity",
x = "Person Capacity",
y = " Log Average Price",
color = "Room Type") +
theme_minimal() +
facet_wrap(~day_type)
average_prices_capacity <- subset_df %>%
filter(room_type != "Shared room") %>%
group_by(room_type, person_capacity, day_type) %>%
summarize(avg_price = mean(realSum))
## `summarise()` has grouped output by 'room_type', 'person_capacity'. You can
## override using the `.groups` argument.
ggplot(average_prices_capacity, aes(x = person_capacity, y = log(avg_price), color = room_type)) +
geom_point(position = position_dodge(width = 0.8), size = 6) +
geom_line(aes(group = room_type), position = position_dodge(width = 0.5), linetype = "dashed") +
scale_color_viridis_d() +
labs(title = "Log Average Price for Each Room Type by Person Capacity",
x = "Person Capacity",
y = " Log Average Price",
color = "Room Type") +
theme_minimal() +
facet_wrap(~day_type)
average_prices_distance <- subset_df %>%
group_by(room_type, dist, day_type) %>%
summarize(avg_price = mean(realSum))
## `summarise()` has grouped output by 'room_type', 'dist'. You can override using
## the `.groups` argument.
ggplot(average_prices_distance, aes(x = dist, y = log(avg_price), color = room_type)) +
geom_point(position = position_dodge(width = 0.5), size = 3) +
geom_line(aes(group = room_type), position = position_dodge(width = 0.5), linetype = "dashed") +
labs(title = "Average Price for Each Room Type by Distance",
x = "Distance from City Center",
y = "Average Price",
color = "Room Type") +
theme_minimal() +
facet_grid(room_type ~ day_type)
## Warning: `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
average_prices_distance <- subset_df %>%
group_by(room_type, dist, day_type) %>%
summarize(avg_price = mean(realSum))
## `summarise()` has grouped output by 'room_type', 'dist'. You can override using
## the `.groups` argument.
ggplot(average_prices_distance, aes(x = dist, y = log(avg_price), color = room_type)) +
geom_point(position = position_dodge(width = 0.5), size = 3) +
geom_smooth(aes(group = room_type), method = "loess", se = FALSE) +
scale_color_viridis_d() + # Set the color palette to Viridis
labs(title = "Average Price for Each Room Type by Distance",
x = "Distance from City Center",
y = "Average Price",
color = "Room Type") +
theme_minimal() +
facet_grid(room_type ~ day_type)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
average_prices_distance <- subset_df %>%
group_by(room_type, dist, day_type) %>%
summarize(avg_price = mean(realSum))
## `summarise()` has grouped output by 'room_type', 'dist'. You can override using
## the `.groups` argument.
ggplot(average_prices_distance, aes(x = dist, y = log(avg_price), color = room_type)) +
geom_point(position = position_dodge(width = 0.5), size = 3, alpha = 0.5) +
geom_smooth(aes(group = room_type), method = "loess", se = FALSE, color = "black") + # Set the color of the smooth line to red
scale_color_viridis_d() +
labs(title = "Log Average Price for Each Room Type by Distance",
x = "Distance from City Center",
y = "Log Average Price",
color = "Room Type") +
theme_minimal() +
facet_grid(room_type ~ day_type)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
average_prices_distance <- subset_df %>%
filter(room_type != "Shared room") %>%
group_by(room_type, dist, day_type) %>%
summarize(avg_price = mean(realSum))
## `summarise()` has grouped output by 'room_type', 'dist'. You can override using
## the `.groups` argument.
ggplot(average_prices_distance, aes(x = dist, y = log(avg_price), color = room_type)) +
geom_point(position = position_dodge(width = 0.5), size = 3, alpha = 0.5) +
geom_smooth(aes(group = room_type), method = "loess", se = FALSE, color = "black") +
scale_color_viridis_d() +
labs(title = "Log Average Price for Each Room Type by Distance",
x = "Distance from City Center",
y = "Log Average Price",
color = "Room Type") +
theme_minimal() +
facet_grid(room_type ~ day_type)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
ggplot(subset_df, aes(x = log(dist), y = log(realSum), color = host_is_superhost)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "auto", se = FALSE, linetype = "dashed") +
labs(title = "Relation between Price and Distance",
x = "Distance from city center",
y = "Price",
color = "Host") +
theme_minimal() +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~day_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(combined, aes(x = cleanliness_rating, y = realSum, color = room_type)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "auto", se = FALSE, linetype = "dashed") +
labs(title = "Relation between Cleanliness and Price",
x = "Cleanliness",
y = "Price",
color = "Room Type") +
theme_minimal() +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~day_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`
## Computation failed in `stat_smooth()`
## Caused by error in `smooth.construct.cr.smooth.spec()`:
## ! x has insufficient unique values to support 10 knots: reduce k.
Obsevations:
df$bedrooms_cat <- as.factor(df$bedrooms)
ggplot(df, aes(x = cleanliness_rating, y = realSum, color = bedrooms_cat)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "auto", se = FALSE, linetype = "dashed") +
labs(title = "Relation between Cleanliness and Price",
x = "Cleanliness",
y = "Price",
color = "No. of bed rooms") +
theme_minimal() +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~day_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`
## Computation failed in `stat_smooth()`
## Caused by error in `smooth.construct.cr.smooth.spec()`:
## ! x has insufficient unique values to support 10 knots: reduce k.
obs: 1. The highest pricses would be for cleanliness 10 or 8 and aribnbs having bedrooms 0 to 3
ggplot(df, aes(x = bedrooms_cat, y = realSum, color = host_is_superhost)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "auto", se = FALSE, linetype = "dashed") +
labs(title = "Relation between Number of bedrooms and Price",
x = "Number of bedroooms",
y = "Price",
color = "Is host superhost") +
theme_minimal() +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~day_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Obs: 1. It can be seen that the price of airbnbs with same bedrooms
remain more on less same on a the two days. 2. generally on the higher
end of prices there are superhosts.
ggplot(df, aes(x = log(dist), y = log(realSum), color = host_is_superhost)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "auto", se = FALSE, linetype = "dashed") +
labs(title = "Relation between Price and Distance",
x = "Distance from city center",
y = "Price",
color = "Host") +
theme_minimal() +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~day_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
obs: 1. We can see that in generall on weekdays and weekends as the distance increases the price decreasea for an airbnb 2. On both the types of the week, there are more non superhosts which have airbnbs from 2-10 on an higher price range.
ggplot(df, aes(x = cleanliness_rating, y = realSum, color = host_is_superhost)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "auto", se = FALSE, linetype = "dashed") +
labs(title = "Relation between Cleanliness and Price",
x = "Cleanliness",
y = "Price",
color = "Host") +
theme_minimal() +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~day_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`
## Computation failed in `stat_smooth()`
## Caused by error in `smooth.construct.cr.smooth.spec()`:
## ! x has insufficient unique values to support 10 knots: reduce k.
ggplot(df, aes(x = cleanliness_rating, y = guest_satisfaction_overall)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "auto", se = FALSE, linetype = "dashed") +
labs(title = "Relation between Cleanliness and Guest Satisfaction",
x = "Cleanliness",
y = "Guest Satisfaction",
color = "Host") +
theme_minimal() +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~day_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`
## Computation failed in `stat_smooth()`
## Caused by error in `smooth.construct.cr.smooth.spec()`:
## ! x has insufficient unique values to support 10 knots: reduce k.
ggplot(df, aes(x = cleanliness_rating, y = guest_satisfaction_overall, color = host_is_superhost)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE,) +
labs(title = "Relation between Cleanliness and Guest Satisfaction",
x = "Cleanliness",
y = "Guest Satisfaction",
color = "Host") +
theme_minimal() +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~day_type)
## `geom_smooth()` using formula = 'y ~ x'
filtered_df = df[df['realSum'] < 2500,]
#rm(list = (weekdays,weekends, weekend_data))
dist.model <- lm(realSum ~ dist + person_capacity, data = filtered_df)
summary(dist.model)
##
## Call:
## lm(formula = realSum ~ dist + person_capacity, data = filtered_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -583.21 -104.63 -35.17 61.68 1912.70
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 157.9209 6.7973 23.23 <2e-16 ***
## dist -26.7318 0.7523 -35.53 <2e-16 ***
## person_capacity 115.5685 1.6418 70.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 202.2 on 9953 degrees of freedom
## Multiple R-squared: 0.4084, Adjusted R-squared: 0.4083
## F-statistic: 3436 on 2 and 9953 DF, p-value: < 2.2e-16
lniear_model <- augment(dist.model,filtered_df)
ggplot(lniear_model, aes(x = dist, y = .resid, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
scale_color_viridis()+
ggtitle("Residuals v/s Distance(Linear model)")+
xlab("Distance")+
ylab("Residuals")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
From the above residual plot, there is a slight curve for the residual
line. We can do better than the existing linear model.
ggplot(lniear_model, aes(x = dist, y = .fitted, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
scale_color_viridis()+
ggtitle("Fitted Values v/s Distance(Linear model)")+
xlab("Distance")+
ylab("Fitted Values")+ theme_bw()
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
dist.gam.model <- gam(realSum ~ s(dist) + person_capacity, data = filtered_df)
summary(dist.gam.model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## realSum ~ s(dist) + person_capacity
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.856 5.015 3.959 7.58e-05 ***
## person_capacity 113.983 1.620 70.348 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist) 8.349 8.886 190.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.43 Deviance explained = 43.1%
## GCV = 39431 Scale est. = 39390 n = 9956
gam.df <- augment(dist.gam.model,filtered_df)
ggplot(gam.df, aes(x = dist, y = .resid, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
ggtitle("Residuals v/s Distance (GAM model)")+
xlab("Distance")+
ylab("Residuals")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggplot(gam.df, aes(x = dist, y = .fitted, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
ggtitle("Fitted values v/s Distance (GAM model)")+
xlab("Distance")+
ylab("Fitted values")+ theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggplot(gam.df, aes(x = log(dist), y = log(realSum), color=person_capacity)) +
geom_point(size=2) +
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
scale_color_viridis()+
ggtitle("Log Price v/s Log Distance")+
xlab("Distance")+
ylab("Price")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
rt.model <- lm(realSum ~ dist, data = filtered_df)
summary(rt.model)
##
## Call:
## lm(formula = realSum ~ dist, data = filtered_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -334.03 -159.37 -62.28 85.80 1962.52
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 518.2529 5.4732 94.69 <2e-16 ***
## dist -32.7213 0.9148 -35.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 247.5 on 9954 degrees of freedom
## Multiple R-squared: 0.1139, Adjusted R-squared: 0.1138
## F-statistic: 1279 on 1 and 9954 DF, p-value: < 2.2e-16
rt.model <- augment(rt.model,filtered_df)
ggplot(rt.model, aes(x = dist, y = .resid, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
scale_color_viridis()+
ggtitle("Residuals v/s Distance (Linear model)")+
xlab("Distance")+
ylab("Residuals")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggplot(rt.model, aes(x = dist, y = .fitted, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
scale_color_viridis()+
ggtitle("Fitted Values v/s Distance (Linear model)")+
xlab("Distance")+
ylab("Fitted values")+ theme_bw()
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggplot(gam.df, aes(x = dist, y = .fitted, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
ggtitle("Fitted values v/s Distance")+
xlab("Distance")+
ylab("Fitted Values")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
dist.gam.model <- gam(realSum ~ s(dist) , data = filtered_df)
summary(dist.gam.model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## realSum ~ s(dist)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 343.740 2.434 141.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist) 8.445 8.916 192.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.147 Deviance explained = 14.8%
## GCV = 59025 Scale est. = 58969 n = 9956
gam.df <- augment(dist.gam.model,filtered_df)
ggplot(gam.df, aes(x = dist, y = .resid, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
ggtitle("Residuals v/s Distance")+
xlab("Distance")+
ylab("Residuals")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggplot(gam.df, aes(x = dist, y = .fitted, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
ggtitle("Fitted values v/s Distance(GAM model")+
xlab("Distance")+
ylab("Fitted Values")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
modified <- gam(realSum ~ s(dist) + room_type * person_capacity, data = filtered_df)
modified.df <- augment(modified,filtered_df)
ggplot(modified.df, aes(x = dist, y = .resid, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(day_type ~ ., nrow = 1, ncol = 2) + scale_color_viridis()+
ggtitle("Residuals v/s Distance")+
xlab("Distance")+
ylab("Residuals")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggplot(modified.df, aes(x = dist, y = .fitted, color=person_capacity)) +
geom_point(size=2) + geom_smooth(method = "loess", se=FALSE)+
facet_wrap(person_capacity~., nrow = 2, ncol = 5) + scale_color_viridis()+
ggtitle("Fitted values v/s Distance(GAM model")+
xlab("Distance")+
ylab("Fitted Values")
## `geom_smooth()` using formula = 'y ~ x'
summary(modified)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## realSum ~ s(dist) + room_type * person_capacity
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 192.500 7.789 24.713 < 2e-16 ***
## room_typePrivate room -64.048 11.865 -5.398 6.9e-08 ***
## room_typeShared room -58.539 71.688 -0.817 0.4142
## person_capacity 85.984 2.011 42.766 < 2e-16 ***
## room_typePrivate room:person_capacity -45.098 4.320 -10.438 < 2e-16 ***
## room_typeShared room:person_capacity -62.631 24.841 -2.521 0.0117 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist) 8.183 8.825 175.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.514 Deviance explained = 51.4%
## GCV = 33672 Scale est. = 33624 n = 9956
# ggplot(df, aes(x = as.factor(person_capacity))) +
# geom_bar() +
# facet_grid(~ room_type) +
# labs(title = "Frequency of Person Capacity by Room Type",
# x = "Person Capacity",
# y = "Frequency")
ggplot(df, aes(x = as.factor(person_capacity))) +
geom_bar() +
facet_grid(room_type ~ day_type) +
labs(title = "Frequency of Person Capacity by Room Type and Day Type",
x = "Person Capacity",
y = "Frequency")
ggplot(df, aes(x = realSum)) +
geom_density(fill = "blue", alpha = 0.5) +
facet_grid(~day_type) +
labs(title = "Smooth Frequency Plot for real_sum",
x = "real_sum",
y = "Density")
ggplot(df, aes(x = dist)) +
geom_density(fill = "blue", alpha = 0.5) +
facet_grid(~day_type) +
labs(title = "Smooth Frequency Plot for real_sum",
x = "real_sum",
y = "Density")
For equal distribution, we should use log on real sum
d1 = subset_df[subset_df$day_type == "Weekday", ]
model1 <- gam(log(realSum) ~ s(dist) + person_capacity * room_type, data = d1, method = "REML")
summary(model1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(realSum) ~ s(dist) + person_capacity * room_type
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.504885 0.024474 224.929 < 2e-16 ***
## person_capacity 0.162348 0.006782 23.940 < 2e-16 ***
## room_typePrivate room -0.453561 0.035194 -12.887 < 2e-16 ***
## person_capacity:room_typePrivate room -0.075474 0.012825 -5.885 4.28e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist) 6.392 7.559 127.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.668 Deviance explained = 66.9%
## -REML = 1443.7 Scale est. = 0.1121 n = 4368
d2 = subset_df[subset_df$day_type == "Weekend", ]
model2 <- gam(log(realSum) ~ s(dist) + person_capacity * room_type, data = d2, method = "REML")
summary(model2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(realSum) ~ s(dist) + person_capacity * room_type
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.523447 0.021751 253.942 <2e-16 ***
## person_capacity 0.153068 0.006083 25.165 <2e-16 ***
## room_typePrivate room -0.531955 0.031759 -16.750 <2e-16 ***
## person_capacity:room_typePrivate room -0.026236 0.011672 -2.248 0.0246 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist) 6.381 7.549 162.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.66 Deviance explained = 66.1%
## -REML = 1570.3 Scale est. = 0.10761 n = 5076
Bigger coefficent for distance suggest that distance matters more on weekend when determinig prices.
#residual plot
model1.df <- augment(model1, d1)
ggplot(model1.df, aes(x = dist, y = .resid)) +
geom_jitter(height = 0.25, width = 0.5) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
model2.df <- augment(model2, d2)
ggplot(model2.df, aes(x = dist, y = .resid)) +
geom_jitter(height = 0.25, width = 0.5) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# head(model1.df)
#fitted plot
ggplot(model1.df, aes(x = dist, y = .fitted, color = person_capacity)) +
geom_point(size = 1.5, alpha = 0.2) +
facet_grid(room_type ~ person_capacity, scales = "free")+
#facet_wrap(~day_type , nrow = 1) +
scale_color_viridis()+
ggtitle("Weekday Fitted values based on Distance, room type, and capacity")+
xlab("Distance")+
ylab("Fitted Values")+
theme_minimal()
ggplot(model2.df, aes(x = dist, y = .fitted, color = person_capacity)) +
geom_point(size = 1.5, alpha = 0.2) +
facet_grid(room_type ~ person_capacity, scales = "free")+
#facet_wrap(~day_type , nrow = 1) +
scale_color_viridis()+
ggtitle("Weekend Fitted values based on Distance, room type, and capacity")+
xlab("Distance")+
ylab("Fitted Values")+
theme_minimal()
#homoskedesticity
ggplot(model1.df, aes(x = .fitted, y = abs(.resid))) +
geom_jitter(width = 0.5) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'